import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_study_06_dushu.items import ScrapyStudy06DushuItem


class ReadSpider(CrawlSpider):
    """Crawl the dushu.com book listing for category 1188 and yield one item
    per book cover image found on each listing page."""

    name = "read"
    allowed_domains = ["www.dushu.com"]
    # The category landing page is /book/1188.html, which does not match the
    # link-extractor regex below and would be lost from the crawl; start from
    # the equivalent /book/1188_1.html so page 1 is covered by the rule too.
    start_urls = ["https://www.dushu.com/book/1188_1.html"]

    # follow=False: pagination links matching the regex are extracted only from
    # the start_urls responses; links found on subsequently crawled pages are
    # not followed further.
    rules = (
        Rule(
            LinkExtractor(allow=r"/book/1188_\d+\.html"),
            callback="parse_item",
            follow=False,
        ),
    )

    def parse_item(self, response):
        """Parse one listing page.

        Yields:
            ScrapyStudy06DushuItem: one item per <img> in the book list,
            with ``name`` from the alt text and ``src`` from the image URL.
        """
        img_list = response.xpath("//div[@class='bookslist']//img")
        for img in img_list:
            name = img.xpath("./@alt").extract_first()
            # NOTE(review): data-original suggests lazy-loaded covers; some
            # images may only carry a plain src attribute — fall back to it so
            # those items are not yielded with src=None. Confirm against the
            # live page markup.
            src = (
                img.xpath("./@data-original").extract_first()
                or img.xpath("./@src").extract_first()
            )
            yield ScrapyStudy06DushuItem(name=name, src=src)
